Section 2-2: Latent Semantic Analysis (LSA) Directing Word2Vec Models (Python Chunk)
First we considered LSA as a rapid alternative to Latent Dirichlet Allocation (LDA). LSA provided a singular value decomposition alternative to LDA. The concept of coherence was also useful in this case. Data analysis in this section began with removal of common English stop words. Sentences were then manually verified to ensure important terms were not removed. Stemming was then used to reduce words to their roots to reduce the effects of tense and voice on creating unnecessary divisions between effectively equivalent words. Given the restricted amount of data, use of a UMass algorithm was considered potentially superior to an arbitrary split of the data (i.e., into test and training sets) and so this was implemented. We later look at performance of LDA. Note that to maximize the capacity for accurate extraction of topic number, a median of bootstraps of topic number estimates was computed and used to develop the later models.
# importing all necessary modules
## Data management
import pandas as pd
## Gensim Main
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import common_texts
## Tokenizing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
import warnings
## Other
from nltk.corpus import stopwords
from gensim.test.utils import datapath
import re
import unicodedata
from tqdm import tqdm
import multiprocessing
import random
import xlrd
import openpyxl
from statistics import median
## More for LSA
### Gensim
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
### sklearn
from sklearn import cluster, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
### For plotting if required
## import matplotlib.pyplot as plt
## The following resource was used to direct further analysis
## https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
# Create functions
## For loading excel files
def load_excel(path, file_name):
    """
    Read an Excel workbook into a DataFrame.

    The first column of the sheet becomes the DataFrame index.
    """
    full_path = os.path.join(path, file_name)
    return pd.read_excel(full_path, index_col=0)
## For taking word units like paragraphs or sentences into word tokens
def process_tokens(input_text_units, target_column='sentences'):
    """
    Turn text units (paragraphs or sentences) into cleaned word tokens.

    Each entry of the target column is lower-cased, word-tokenized,
    stripped of common English stop words, and Porter-stemmed.

    Input: frame holding whatever division of data is desired.
    Output: list of processed token lists, one per text unit.
    """
    ## Stemming reference: https://tartarus.org/martin/PorterStemmer/
    eng_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    processed_tokens = []
    for unit in input_text_units[target_column]:
        # lower case is required before stemming
        tokens = word_tokenize(unit.lower())
        # drop stop words, then reduce each survivor to its stem
        kept = [tok for tok in tokens if tok not in eng_stop]
        processed_tokens.append([p_stemmer.stem(tok) for tok in kept])
    return processed_tokens
## For taking word units like paragraphs or sentences into word tokens without frills
def reprocess_tokens(input_text_units, target_column='sentences'):
    """
    Tokenize text units without stop-word removal or stemming.

    Lower-cases each unit and returns a simple list of token lists.
    """
    return [word_tokenize(unit.lower())
            for unit in input_text_units[target_column]]
## detokenize for sklearn
### https://towardsdatascience.com/latent-semantic-analysis-deduce-the-hidden-topic-from-the-document-f360e8c0614b
### https://scikit-learn.org/
def detokenize_for_sk(input_tokens):
    """
    Join token lists back into space-separated strings for sklearn.

    Input: list of token lists.
    Output: list of strings, one "mutated sentence" per document.
    """
    # Iterate the token lists directly instead of indexing via range(len(...))
    return [' '.join(tokens) for tokens in input_tokens]
## Create A Document Term Matrix
def dictionary_DTM(clean_list):
    """
    Build the gensim dictionary and Document Term Matrix (DTM) for a corpus.

    Returns the token-id dictionary and the bag-of-words representation
    of every document.
    """
    # Map every token in the corpus to an integer id
    dictionary = corpora.Dictionary(clean_list)
    # Represent each document as (token id, count) pairs using that dictionary
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_list]
    return dictionary, doc_term_matrix
## Create Latent Semantic Analysis Models
def create_lsa_model(clean_list, number_of_topics):
    """
    Fit an LSA (LSI) model on the input corpus for a given number of topics.
    """
    dictionary, doc_term_matrix = dictionary_DTM(clean_list)
    # generate the LSA model from the bag-of-words corpus
    return LsiModel(doc_term_matrix,
                    num_topics=number_of_topics,
                    id2word=dictionary)
## Find Coherence
def get_coherence_for_set_DTM(dictionary, DTM, clean_list, stop, step=1, start=2):
    """
    Fit LSA models over a range of topic counts and score each one.

    Topic counts run from start (inclusive) to stop (exclusive) in steps
    of step. Returns the fitted models and their coherence values in the
    same order.

    Coherence options include 'u_mass', 'c_v', 'c_uci', 'c_npmi':
    https://radimrehurek.com/gensim/models/coherencemodel.html
    UMass was selected because it is rapid.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, stop, step):
        lsa = LsiModel(DTM, num_topics=num_topics, id2word=dictionary)
        model_list.append(lsa)
        score = CoherenceModel(model=lsa,
                               texts=clean_list,
                               dictionary=dictionary,
                               coherence='u_mass').get_coherence()
        coherence_values.append(score)
    return model_list, coherence_values
## Rep Modelling
def rep_coherence(dictionaryIn, DTMIn, tokensIn, num_iter=10000):
    """
    Bootstrap the topic-count selection.

    Repeatedly scores topic counts (up to 10) with coherence, records
    the index of the best score each round, and returns the median of
    those indices.
    """
    best_indices = []
    for _ in range(num_iter):
        _, cohere = get_coherence_for_set_DTM(dictionaryIn,
                                              DTMIn,
                                              tokensIn,
                                              10)
        best_indices.append(cohere.index(max(cohere)))
    return median(best_indices)
# SK learn
## Reference
### https://towardsdatascience.com/latent-semantic-analysis-deduce-the-hidden-topic-from-the-document-f360e8c0614b
## Obtain topics
def SVD_topic(dfInIt, numTopicsIn=2):
    """
    Run TF-IDF + truncated SVD (LSA) over the prepared sentences.

    Parameters:
        dfInIt: frame with 'prep_sentences', 'sentences' and 'c_1' columns.
        numTopicsIn: number of topics (SVD components) to extract.

    Returns:
        topic_encoded_df: per-document topic scores plus the processed
            text, raw text and identifier columns.
        encoding_matrix: per-word topic loadings (rows = words).
    """
    ## Topic column labels: topic_1 .. topic_<numTopicsIn>
    topicHeadings = ["topic_" + str(k) for k in range(1, numTopicsIn + 1)]
    ## Instantiate Vectorizer
    vectorizer = TfidfVectorizer(smooth_idf=True)
    ## Fix: pass numTopicsIn directly instead of relying on the value a
    ## loop variable happened to leak out of the heading-building loop.
    svd_model_topic = TruncatedSVD(n_components=numTopicsIn, algorithm='randomized',
                                   n_iter=100, random_state=12345)
    vectX = vectorizer.fit_transform(dfInIt['prep_sentences'])
    lsaX = svd_model_topic.fit_transform(vectX)
    topic_encoded_df = pd.DataFrame(lsaX, columns=topicHeadings)
    # NOTE(review): these assignments align on the index; assumes dfInIt
    # carries a 0..n-1 index matching the new frame -- confirm.
    topic_encoded_df["documents"] = dfInIt['prep_sentences']
    topic_encoded_df["documents_raw"] = dfInIt['sentences']
    topic_encoded_df["identifier"] = dfInIt['c_1']
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # switch to get_feature_names_out() when upgrading sklearn.
    dictionary = vectorizer.get_feature_names()
    # Transpose so rows are words and columns are topics
    encoding_matrix = pd.DataFrame(svd_model_topic.components_,
                                   index=topicHeadings,
                                   columns=(dictionary)).T
    encoding_matrix["word"] = dictionary
    return topic_encoded_df, encoding_matrix
# Word2Vec
## Create the general model
def create_sg_model(sentsIn, columnFocus='prep_sentences', num_iter=100, workers=None):
    """
    Train a Word2Vec skip-gram model to find words sharing a context.

    Parameters:
        sentsIn: frame holding the text column to model.
        columnFocus: column of sentsIn to tokenize.
        num_iter: training epochs.
        workers: worker threads; defaults to the module-level ``cores - 1``
            (the previous hidden-global behaviour).
    """
    if workers is None:
        # Preserve original behaviour: one thread fewer than the core count
        workers = cores - 1
    ## Skip-gram (sg=1): take a focal word and predict its context
    modelX = Word2Vec(min_count=1, vector_size=50, workers=workers,
                      window=5, sg=1, max_vocab_size=100000)
    ## get the tokens / words
    tokIn = reprocess_tokens(sentsIn, columnFocus)
    ## build the vocabulary with the tokens
    modelX.build_vocab(tokIn, update=False)
    ## train the model
    modelX.train(tokIn, total_examples=modelX.corpus_count, epochs=num_iter)
    return modelX
## Word2Vec Clustering Metrics
def find_silhouettes(w2vModel):
    """
    Choose a cluster count (2..10) for the word vectors by silhouette score.

    KMeans is fitted for each candidate k and the k with the highest
    average silhouette is returned.
    """
    # The vocabulary and its embeddings are invariant across candidate k,
    # so look them up once instead of once per loop iteration.
    vocab = w2vModel.wv.key_to_index.keys()
    embeddings = w2vModel.wv[vocab]
    silhouettesOut = []
    clustersOut = []
    for num_clusters_ind in range(2, 11):
        # Set up kmeans model
        kmeansOut = cluster.KMeans(n_clusters=num_clusters_ind,
                                   random_state=12345)
        # Cluster
        kmeansOut.fit(embeddings)
        # Get the Silhouette
        silhouetteAveOut = metrics.silhouette_score(embeddings,
                                                    kmeansOut.labels_,
                                                    metric='euclidean')
        # Store the data
        silhouettesOut.append(silhouetteAveOut)
        clustersOut.append(num_clusters_ind)
    # Return the cluster count with the best silhouette
    max_index = silhouettesOut.index(max(silhouettesOut))
    return clustersOut[max_index]
# Word2Vec Clustering Model
def create_Clusters(w2vModel, kClusters):
    """
    Cluster the Word2Vec vocabulary into kClusters groups with KMeans.

    Returns the cluster labels, the word embeddings, and the vocabulary keys.
    """
    # Vocabulary keys and their embedding vectors
    vocabKeys = w2vModel.wv.key_to_index.keys()
    wordVectors = w2vModel.wv[vocabKeys]
    # Fit a reproducible KMeans model
    km = cluster.KMeans(n_clusters=kClusters,
                        random_state=12345)
    km.fit(wordVectors)
    return km.labels_, wordVectors, vocabKeys
# Suppress warnings
warnings.filterwarnings(action = 'ignore')
# General variables
## data path
## NOTE(review): this path looks duplicated ("/Users/.../Dropbox/R_files/Users/...");
## confirm it actually resolves on the analysis machine.
data_path = "/Users/johnbrooks/Dropbox/R_files/Users/johnbrooks/Dropbox/Synced/R/STAT 5702/Store/"
## Use multiprocessing package to find the number of cores
cores= multiprocessing.cpu_count()
# Read in our data
## Each workbook holds the free-text responses for one survey question
c14df = load_excel(data_path,"c_14.xlsx")
c22df = load_excel(data_path,"c_22.xlsx")
c30df = load_excel(data_path,"c_30.xlsx")
c40df = load_excel(data_path,"c_40.xlsx")
c41df = load_excel(data_path,"c_41.xlsx")
c43df = load_excel(data_path,"c_43.xlsx")
c45df = load_excel(data_path,"c_45.xlsx")
c46df = load_excel(data_path,"c_46.xlsx")
c48df = load_excel(data_path,"c_48.xlsx")
# 1. Gensim Segment
## NOTE(review): the nine stanzas below apply the same pipeline
## (tokenize -> dictionary/DTM -> coherence -> bootstrap topic count)
## to one question's frame each; only the source frame differs.
## The ml*/c*/top* module-level names are kept because the knitted R
## side may read them through reticulate.
## Segment variables
number_Iterations = 1
## First run
varInIt = c14df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml14, c14 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c14df['prep_sentences'] = detokenize_for_sk(xOut)
## Bootstrap number of topics by recalculating coherence and taking median of bootstraps
### We add 2 because the index is returned
#### The indices indicate the number of topics where index 0 = 2 topics, 1 = 3...
top14 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top14)
varInIt = c22df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml22, c22 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c22df['prep_sentences'] = detokenize_for_sk(xOut)
top22 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top22)
varInIt = c30df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml30, c30 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c30df['prep_sentences'] = detokenize_for_sk(xOut)
top30 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top30)
varInIt = c40df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml40, c40 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c40df['prep_sentences'] = detokenize_for_sk(xOut)
top40 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top40)
varInIt = c41df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml41, c41 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c41df['prep_sentences'] = detokenize_for_sk(xOut)
top41 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top41)
varInIt = c43df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml43, c43 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c43df['prep_sentences'] = detokenize_for_sk(xOut)
top43 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top43)
varInIt = c45df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml45, c45 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c45df['prep_sentences'] = detokenize_for_sk(xOut)
top45 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top45)
varInIt = c46df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml46, c46 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c46df['prep_sentences'] = detokenize_for_sk(xOut)
top46 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top46)
varInIt = c48df
xOut = process_tokens(varInIt)
dOut,DTMOut = dictionary_DTM(xOut)
ml48, c48 = get_coherence_for_set_DTM(dOut,DTMOut,xOut,10)
c48df['prep_sentences'] = detokenize_for_sk(xOut)
top48 = rep_coherence(dOut,DTMOut,xOut,number_Iterations) + 2
#print(top48)
# 2. SK learn Segment
## Reference
### https://towardsdatascience.com/latent-semantic-analysis-deduce-the-hidden-topic-from-the-document-f360e8c0614b
# After 10000 simulations we found that number of topics were 3 in question 10, 19, 22 and 30 but otherwise were 2
## Use singular value decomposition for the number of topics elucidated in the prior segment
## te* = per-document topic scores, em* = per-word topic loadings
te14, em14 = SVD_topic(c14df,3)
te22, em22 = SVD_topic(c22df)
te30, em30 = SVD_topic(c30df,3)
te40, em40 = SVD_topic(c40df,3)
te41, em41 = SVD_topic(c41df)
te43, em43 = SVD_topic(c43df)
te45, em45 = SVD_topic(c45df)
te46, em46 = SVD_topic(c46df)
te48, em48 = SVD_topic(c48df,3)
## Write out results
## One sheet per question: word loadings to wordsOut.xlsx, topic scores to topicOut.xlsx
with pd.ExcelWriter(os.path.join(data_path, "wordsOut.xlsx")) as writer:
    em14.to_excel(writer, sheet_name='c_14')
    em22.to_excel(writer, sheet_name='c_22')
    em30.to_excel(writer, sheet_name='c_30')
    em40.to_excel(writer, sheet_name='c_40')
    em41.to_excel(writer, sheet_name='c_41')
    em43.to_excel(writer, sheet_name='c_43')
    em45.to_excel(writer, sheet_name='c_45')
    em46.to_excel(writer, sheet_name='c_46')
    em48.to_excel(writer, sheet_name='c_48')
with pd.ExcelWriter(os.path.join(data_path, "topicOut.xlsx")) as writer:
    te14.to_excel(writer, sheet_name='c_14')
    te22.to_excel(writer, sheet_name='c_22')
    te30.to_excel(writer, sheet_name='c_30')
    te40.to_excel(writer, sheet_name='c_40')
    te41.to_excel(writer, sheet_name='c_41')
    te43.to_excel(writer, sheet_name='c_43')
    te45.to_excel(writer, sheet_name='c_45')
    te46.to_excel(writer, sheet_name='c_46')
    te48.to_excel(writer, sheet_name='c_48')
# 3. Word2Vec Segment
## Model the topic to find synonyms using LSA model insights
## The t<question>t<topic> variables hold (word, similarity) pairs and are
## read from R through reticulate (py$t14t1 etc.).
## NOTE(review): most_similar appears to return 10 neighbours by default,
## so the [:10] slices are likely redundant -- confirm against the gensim docs.
model14 = create_sg_model(c14df)
t14t1 = model14.wv.most_similar('work')[:10]
t14t2 = model14.wv.most_similar('train')[:10]
t14t3 = model14.wv.most_similar('tool')[:10]
model22 = create_sg_model(c22df)
t22t1 = model22.wv.most_similar('email')[:10]
t22t2 = model22.wv.most_similar('team')[:10]
model30 = create_sg_model(c30df)
t30t1 = model30.wv.most_similar('servic')[:10]
t30t2 = model30.wv.most_similar('burden')[:10]
model40 = create_sg_model(c40df)
t40t1 = model40.wv.most_similar('project')[:10]
t40t2 = model40.wv.most_similar('procur')[:10]
t40t3 = model40.wv.most_similar('fund')[:10]
model41 = create_sg_model(c41df)
t41t1 = model41.wv.most_similar('time')[:10]
t41t2 = model41.wv.most_similar('servic')[:10]
model43 = create_sg_model(c43df)
t43t1 = model43.wv.most_similar('time')[:10]
t43t2 = model43.wv.most_similar('respons')[:10]
model45 = create_sg_model(c45df)
t45t1 = model45.wv.most_similar('manag')[:10]
t45t2 = model45.wv.most_similar('time')[:10]
model46 = create_sg_model(c46df)
t46t1 = model46.wv.most_similar('home')[:10]
t46t2 = model46.wv.most_similar('provid')[:10]
model48 = create_sg_model(c48df)
t48t1 = model48.wv.most_similar('listen')[:10]
t48t2 = model48.wv.most_similar('feedback')[:10]
t48t3 = model48.wv.most_similar('client')[:10]
# note that as soon as the vocab is updated the corpus is updated
# model.build_vocab(tokenized_sents, update = False)
## Clustering of Word 2 Vec Models
### https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/
### https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/
### https://scikit-learn.org/stable/modules/clustering.html
# Find the number of topics with a limit of 10 topics
topW14 = find_silhouettes(model14)
topW22 = find_silhouettes(model22)
topW30 = find_silhouettes(model30)
topW40 = find_silhouettes(model40)
topW41 = find_silhouettes(model41)
topW43 = find_silhouettes(model43)
topW45 = find_silhouettes(model45)
topW46 = find_silhouettes(model46)
topW48 = find_silhouettes(model48)
# Create the clustering labels
kO14, emb14, Dic14 = create_Clusters(model14,topW14)
kO22, emb22, Dic22 = create_Clusters(model22,topW22)
kO30, emb30, Dic30 = create_Clusters(model30,topW30)
kO40, emb40, Dic40 = create_Clusters(model40,topW40)
kO41, emb41, Dic41 = create_Clusters(model41,topW41)
kO43, emb43, Dic43 = create_Clusters(model43,topW43)
kO45, emb45, Dic45 = create_Clusters(model45,topW45)
kO46, emb46, Dic46 = create_Clusters(model46,topW46)
kO48, emb48, Dic48 = create_Clusters(model48,topW48)
# Make the embeddings into frames
## Each frame: embedding dimensions, then the word ('dict') and its cluster label ('labs')
dfemb14 = pd.DataFrame(emb14)
dfemb14['dict'] = Dic14
dfemb14['labs'] = kO14
dfemb22 = pd.DataFrame(emb22)
dfemb22['dict'] = Dic22
dfemb22['labs'] = kO22
dfemb30 = pd.DataFrame(emb30)
dfemb30['dict'] = Dic30
dfemb30['labs'] = kO30
dfemb40 = pd.DataFrame(emb40)
dfemb40['dict'] = Dic40
dfemb40['labs'] = kO40
dfemb41 = pd.DataFrame(emb41)
dfemb41['dict'] = Dic41
dfemb41['labs'] = kO41
dfemb43 = pd.DataFrame(emb43)
dfemb43['dict'] = Dic43
dfemb43['labs'] = kO43
dfemb45 = pd.DataFrame(emb45)
dfemb45['dict'] = Dic45
dfemb45['labs'] = kO45
dfemb46 = pd.DataFrame(emb46)
dfemb46['dict'] = Dic46
dfemb46['labs'] = kO46
dfemb48 = pd.DataFrame(emb48)
dfemb48['dict'] = Dic48
dfemb48['labs'] = kO48
# Integrate into r with reticulate
## https://rstudio.github.io/reticulate/articles/r_markdown.html
Here the topics are presented including their interpretation and top 10 associated words. This is followed by the use of a keyword that feeds into a word to vector model. The idea for this was that one could visualize the topic but then use the context surrounding a word to see what other words are similar in usage. Even if the number of words is small in the corpus overall, if the particular word / concept was consistently attached to the key word, sometimes even exclusively, this would be highlighted. In essence the topic analysis gave themes and the word embedding analysis gave insights into particular concepts related to those topics. A sum of vectors to express a topic was also considered but not implemented presently.
# Read out python list variables for better presentation
feedTwoByTwo <- function(pythonList){
  # Flatten the reticulate python list of (word, similarity) pairs.
  # unlist coerces everything to character because words are present.
  newVector <- unlist(pythonList)
  # Odd positions hold words, even positions hold similarity scores
  indiciesVectO <- seq(1, length(newVector), 2)
  indiciesVectE <- seq(2, length(newVector), 2)
  # Fix: restore the similarity scores to numeric after the coercion above
  return(data.frame(word = newVector[indiciesVectO],
                    similarity = as.numeric(newVector[indiciesVectE])))
}
# Draw the top 10 words associated with a select topic
drawTop10 <- function(dfInD, topicOfInterest = 1){
  # Rank rows by their loading on the chosen topic, largest first
  ranking <- order(dfInD[, topicOfInterest], decreasing = TRUE)
  # Return the words of the ten best-ranked rows
  dfInD$word[ranking[1:10]]
}
# Question 10. Training / Tools Desired
## Topic Interpretation: working from home
drawTop10(py$em14,1)
## [1] "train" "monitor" "would" "work" "home" "laptop" "use"
## [8] "second" "tool" "phone"
## Word tested: "work"
(feedTwoByTwo(py$t14t1))
## Topic Interpretation: training concerns - want formalized / online / at home
drawTop10(py$em14,2)
## [1] "train" "onlin" "team" "learn" "tool" "excel" "new"
## [8] "time" "applic" "softwar"
## Word tested: "train"
(feedTwoByTwo(py$t14t2))
## Topic Interpretation: tools available / printer and scanner more than software
drawTop10(py$em14,3)
## [1] "printer" "scanner" "train" "offic" "tool" "onlin" "import"
## [8] "inform" "portabl" "effici"
## Word tested: "tool"
(feedTwoByTwo(py$t14t3))
# Question 17. When communicating with my clients
## Topic Interpretation: email as most important tool
drawTop10(py$em22,1)
## [1] "email" "phone" "team" "ms" "combin" "person"
## [7] "winfast" "depend" "situat" "telephon"
## Word tested: "email"
(feedTwoByTwo(py$t22t1))
## Topic Interpretation: team work (ms teams)
drawTop10(py$em22,2)
## [1] "depend" "situat" "ms" "team" "method" "need" "best" "time"
## [9] "sever" "type"
## Word tested: "team"
(feedTwoByTwo(py$t22t2))
#(feedTwoByTwo(py$t22t3))
# Question 19. Prevents client service
## Note that a negative result category (i.e., no response) was skipped
## Topic Interpretation: service quality
drawTop10(py$em30,2)
## [1] "servic" "provid" "except" "client" "noth" "time" "prevent"
## [8] "issu" "custom" "expect"
## Word tested: "servic"
(feedTwoByTwo(py$t30t1))
## Topic Interpretation: impediments / burden of administration
drawTop10(py$em30,3)
## [1] "noth" "prevent" "except" "custom" "servic" "give"
## [7] "burden" "case" "administr" "ext"
## Word tested: "burden"
(feedTwoByTwo(py$t30t2))
#(feedTwoByTwo(py$t30t3))
# Question 22. Policy improvement
## Topic Interpretation: project management / projection process
drawTop10(py$em40,1)
## [1] "process" "project" "polici" "manag" "improv" "need" "time"
## [8] "work" "client" "could"
## Word tested: "project"
(feedTwoByTwo(py$t40t1))
## Topic Interpretation: procurement
drawTop10(py$em40,2)
## [1] "process" "procur" "furnitur" "project" "cumbersom" "approv"
## [7] "streamlin" "purchas" "lengthi" "hire"
## Word tested: "procur"
(feedTwoByTwo(py$t40t2))
## Topic Interpretation: funding distribution
drawTop10(py$em40,3)
## [1] "project" "fund" "could" "improv" "report" "branch" "system"
## [8] "financi" "manag" "rmc"
## Word tested: "fund"
(feedTwoByTwo(py$t40t3))
# Question 23. Successes
## Topic Interpretation: service / client responsibility excellence
drawTop10(py$em41,1)
## [1] "client" "servic" "provid" "team" "work" "need" "time"
## [8] "alway" "excel" "request"
## Word tested: "time"
(feedTwoByTwo(py$t41t1))
## Topic Interpretation: services provided
drawTop10(py$em41,2)
## [1] "servic" "provid" "excel" "team" "secur" "custom" "support"
## [8] "proud" "work" "employe"
## Word tested: "servic"
(feedTwoByTwo(py$t41t2))
#(feedTwoByTwo(py$t41t3))
# Question 25. Compliments to respondent
## Topic Interpretation: time / timeliness is good
drawTop10(py$em43,1)
## [1] "client" "provid" "time" "servic" "respons" "request" "need"
## [8] "inform" "quick" "help"
## Word tested: "time"
(feedTwoByTwo(py$t43t1))
## Topic Interpretation: fast and accurate communication / response
drawTop10(py$em43,2)
## [1] "respons" "quick" "accur" "reliabl" "detail" "simpl" "time"
## [8] "clear" "prompt" "written"
## Word tested: "respons"
(feedTwoByTwo(py$t43t2))
#(feedTwoByTwo(py$t43t3))
# Question 27. Services received by respondent
## Topic Interpretation: managers qualities / support
drawTop10(py$em45,1)
## [1] "servic" "time" "respons" "provid" "work" "help" "team"
## [8] "manag" "quick" "need"
## Word tested: "manag"
(feedTwoByTwo(py$t45t1))
## Topic Interpretation: time / timeliness is good
drawTop10(py$em45,2)
## [1] "respons" "quick" "servic" "easi" "speed" "fast"
## [7] "mutual" "assist" "request" "reaction"
## Word tested: "time"
(feedTwoByTwo(py$t45t2))
#(feedTwoByTwo(py$t45t3))
# Question 28. Impact of restrictions
## Topic Interpretation: working from home not impactful
drawTop10(py$em46,1)
## [1] "servic" "provid" "client" "work" "excel" "home" "covid" "impact"
## [9] "abil" "offic"
## Word tested: "home"
(feedTwoByTwo(py$t46t1))
## Topic Interpretation: what can be offered has not changed
drawTop10(py$em46,2)
## [1] "servic" "provid" "excel" "impact" "still" "abil" "client"
## [8] "best" "chang" "continu"
## Word tested: "provid"
(feedTwoByTwo(py$t46t2))
#(feedTwoByTwo(py$t46t3))
# Question 30. General Comments
## Topic Interpretation: appreciation / being listened to
drawTop10(py$em48,1)
## [1] "thank" "time" "work" "listen" "much" "comment"
## [7] "merci" "would" "servic" "consider"
## Word tested: "listen"
(feedTwoByTwo(py$t48t1))
## Topic Interpretation: appreciation / comments of service and needs
drawTop10(py$em48,2)
## [1] "work" "servic" "client" "time" "fab" "employe" "home"
## [8] "like" "would" "provid"
## Word tested: "feedback"
(feedTwoByTwo(py$t48t2))
## Topic Interpretation: client experience unimpaired
drawTop10(py$em48,3)
## [1] "time" "noth" "consider" "allevi" "oversight" "unrealist"
## [7] "take" "gather" "line" "best"
## Word tested: "client"
(feedTwoByTwo(py$t48t3))
Some of the word embedding projections were quite reasonable while others are hard to interpret. For example, in question 10 the “train” topic did identify the desire for formal online learning. The “tool” topic did capture that printer and scanner were among the most desirable additions for tools. Question 19 did indeed also have a clear link between “burden” of work and bureaucracy. Question 22 strongly identified projection processes as being unpleasant. Where it was not successful it may be that the selected key term is not close enough to the focus of a topic / mentioned often, making co-occurrence very difficult to ascertain with certainty. I had considered creating a combination vector to assess distance between terms as an alternate topic representation although that is not presented in the current analysis.
# TSNE - Week 4 Day 2
## https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html
## https://distill.pub/2016/misread-tsne/
tsneTheData <- function(embeddingFrame, perplexitySelect = 20) {
  # The last two columns hold the word and its cluster label; everything
  # before them is the embedding itself.
  nCols <- ncol(embeddingFrame)
  # Project the embedding columns down to two dimensions
  tsneFit <- Rtsne(embeddingFrame[, -c(nCols - 1, nCols)],
                   dims = 2,
                   perplexity = perplexitySelect)
  # Assemble a plotting frame: coordinates plus word / topic annotations
  data.frame(Dim1 = tsneFit$Y[, 1],
             Dim2 = tsneFit$Y[, 2],
             words = as.matrix(embeddingFrame[, nCols - 1]),
             topics = as.matrix(embeddingFrame[, nCols])
  )
}
# Note that words presented have been stemmed and only a portion of the words are presented for ease of viewing
# Question 10. Training / Tools Desired (9 Topics)
dfOfIntC <- emb14fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Master plot
ggplot(currentTSNE, aes(x=Dim1, y=Dim2, colour=topics)) +
geom_point() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Electronic devices
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 0],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Software packages
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 1],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Mental health and Security
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 2],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Operations
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 3],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Spending
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 4],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Experience with others
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 5],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Software packages
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 6],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Telephone
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 7],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Computer peripherals
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 8],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Question 17. When communicating with my clients
dfOfIntC <- emb22fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Master plot
ggplot(currentTSNE, aes(x=Dim1, y=Dim2, colour=topics)) +
geom_point() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: MS Teams and Telephone
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 0],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Phone and Email
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 1],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Question 19. Prevents client service
dfOfIntC <- emb30fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Master plot
ggplot(currentTSNE, aes(x=Dim1, y=Dim2, colour=topics)) +
geom_point() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Interpretation: Administrative concerns
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 0],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Silos / Separation
subsamplePlot <- unique(sample(indexMaster[dfOfIntC$labs == 1],40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot,], aes(x=Dim1, y=Dim2, label=words)) +
geom_text_repel() +
theme(legend.title = element_blank(),
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())

# Question 22. Policy improvement
dfOfIntC <- emb40fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: Policies impacting workload
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Financial
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Question 23. Successes
dfOfIntC <- emb41fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: Stability
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Bringing things together
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Question 25. Compliments to respondent
dfOfIntC <- emb43fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: Efficiency
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Management and Praise
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Question 27. Services received by respondent (10)
dfOfIntC <- emb45fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: Immediate and skilled
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Collaboration
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Courteous and Strong
clusterRows <- indexMaster[dfOfIntC$labs == 2]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Approval
clusterRows <- indexMaster[dfOfIntC$labs == 3]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Empathy and Wellbeing
clusterRows <- indexMaster[dfOfIntC$labs == 4]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Teaching and Tools
clusterRows <- indexMaster[dfOfIntC$labs == 5]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Enjoyable Experience
clusterRows <- indexMaster[dfOfIntC$labs == 6]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Location / Division
clusterRows <- indexMaster[dfOfIntC$labs == 7]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Fast and Quality
clusterRows <- indexMaster[dfOfIntC$labs == 8]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Caring, Resilience
clusterRows <- indexMaster[dfOfIntC$labs == 9]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Question 28. Impact of restrictions
dfOfIntC <- emb46fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: working together
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: (label left blank in the original — TODO: name this cluster)
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Question 30. General Comments (9)
dfOfIntC <- emb48fd
currentTSNE <- tsneTheData(dfOfIntC)
indexMaster <- 1:nrow(dfOfIntC)
# Shared theme: suppress the legend title, plot title, and both axis titles.
blankLabels <- theme(legend.title = element_blank(),
                     plot.title = element_blank(),
                     axis.title.x = element_blank(),
                     axis.title.y = element_blank())
# Master plot: all embedded terms, coloured by topic cluster.
ggplot(currentTSNE, aes(x = Dim1, y = Dim2, colour = topics)) +
  geom_point() +
  blankLabels

# Interpretation: Self care
# Subsample: 40 draws with replacement, de-duplicated (so ≤ 40 distinct rows).
clusterRows <- indexMaster[dfOfIntC$labs == 0]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Building / Creating
clusterRows <- indexMaster[dfOfIntC$labs == 1]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Responsiveness
clusterRows <- indexMaster[dfOfIntC$labs == 2]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Working together
clusterRows <- indexMaster[dfOfIntC$labs == 3]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Administrative Obstacles and Responsiveness
clusterRows <- indexMaster[dfOfIntC$labs == 4]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Guidance
clusterRows <- indexMaster[dfOfIntC$labs == 5]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Detachment / Attachment / Mental Health
clusterRows <- indexMaster[dfOfIntC$labs == 6]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels

# Interpretation: Innovation and Improvement
clusterRows <- indexMaster[dfOfIntC$labs == 7]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Interpretation: Working at a Distance
clusterRows <- indexMaster[dfOfIntC$labs == 8]
subsamplePlot <- unique(sample(clusterRows, 40, replace = TRUE))
ggplot(currentTSNE[subsamplePlot, ], aes(x = Dim1, y = Dim2, label = words)) +
  geom_text_repel() +
  blankLabels
